library(tidyverse)
## load crime data
load(file="data/crimerates.RData")
options(scipen=0, digits=7) #to standardize number of digits when exporting data. all data here is run using these options.

## load in arrest data
arrests <- readRDS(file="data/arrests-by-city.RDS")

## load economic data
load(file="data/econ.RData")

## load demographic data
load(file = "data/demog.RData")

## combine demog and econ for controls
controls <- left_join(econ,demog,
                      by=c("Id2" = "Id2","state"="state",
                           "city"="city","year"="year"))
tmp <- controls %>% 
  group_by(state, city,year) %>% 
  tally() %>%
  left_join(., controls) %>%
  filter(n>1) #find those city-state-years with multiple entries in our demog dataset

citycontrols <- tmp %>%
  group_by(state, city, year) %>%
  filter(Total_Population == min(Total_Population)) %>%
  select(state, city, year, Id2) %>%
  anti_join(controls,.) ## some city,state,years have duplicates with two different populations
                        ## pull out row that has minimum population bc it's usually off

citycontrols$city[citycontrols$city=='Anchorage municipality' & citycontrols$state=='Alaska'] <- 'Anchorage'
citycontrols$city[citycontrols$city=='Juneau and' & citycontrols$state=='Alaska'] <- 'Juneau'
citycontrols$city[grep('flintridge',citycontrols$city,ignore.case = T)] <- 'La Canada Flintridge'
citycontrols$city[citycontrols$city=='San Buenaventura (Ventura)' & citycontrols$state=='California'] <- 'Ventura'
citycontrols$state[citycontrols$state=='District of Columbia'] <- 'District Of Columbia'
citycontrols$city[citycontrols$city=='Athens-Clarke County unified government (balance)' & 
                citycontrols$state=='Georgia'] <- 'Athens-Clarke County'
citycontrols$city[citycontrols$city=='Urban Honolulu' & citycontrols$state=='Hawaii'] <- 'Honolulu'
citycontrols$city[citycontrols$city=='Boise City' & citycontrols$state=='Idaho'] <- 'Boise'
citycontrols$city[citycontrols$city=='Lexington-Fayette urban county' & citycontrols$state=='Kentucky'] <- 'Lexington'
citycontrols$city[citycontrols$city=='Louisville/Jefferson County metro government (balance)' & 
                citycontrols$state=='Kentucky'] <- 'Louisville'
crimerates$City[crimerates$City=="Las Vegas Metropolitan Police Department" & 
                  crimerates$State == 'Nevada'] <- 'Las Vegas'
citycontrols$city[citycontrols$city=='West Valley City' & citycontrols$state=='Utah'] <- 'West Valley City'

# join crime rates with controls and arrest data
crimecontrol <- left_join(crimerates,citycontrols,
                          by = c("City"="city","State"="state","year"="year")) %>%
  group_by(City,State) %>% 
  complete(year = full_seq(2009:2015, 1)) %>%
  inner_join(.,arrests,by=c("City"="City","State"="State","year"="YEAR")) 

# Some cities aren't matching easily b/c same city/state pairs are repeat (e.g. can be town,township,village,city,etc.)
# Use  population reported in crime data and population in control data to make best match
fix <- crimecontrol %>%
  group_by(City,State,year) %>%
  tally() %>%
  filter(n>1) %>%
  inner_join(.,crimecontrol) %>%
  mutate(pop_diff = Population - Total_Population) 

correct_match <- fix %>%
  group_by(City,State,year)%>%
  mutate(min_diff=min(abs(pop_diff))) %>%
  filter(min_diff == abs(pop_diff)) %>%
  select(City,State,year,Id2) %>%
  rename(Id2_keep = Id2)

# all city states that are not duplicates
controlsA <- anti_join(citycontrols,correct_match,
                       by = c("city"="City","state"="State","year"="year"))
# and merge non-matches together
controls <- inner_join(citycontrols,correct_match,
                        by = c("city"="City","state"="State","year"="year")) %>%
  filter(Id2==Id2_keep) %>%
  bind_rows(controlsA,.) %>%
  arrange(state,city,year) 

# repeat join crime rates with controls now that duplicates have been removed ------
crimecontrol <- left_join(crimerates,controls,
                          by = c("City"="city",
                                 "State"="state",
                                 "year"="year"))%>%
  group_by(City,State) %>% 
  complete(year = full_seq(2009:2015, 1)) %>%
  right_join(.,arrests,by=c("City"="City","State"="State","year"="YEAR")) %>%
  select(-Id2_keep) %>%
  arrange(State,City,year) 

save(crimecontrol,file='data/city-crime-controls.RData')
